# --- Environment setup (Google Colab notebook cell) ---
# networkx pinned below 2.7 — presumably for API compatibility with the
# drawing code below; TODO confirm which call breaks on >=2.7.
!pip install 'networkx<2.7'
from google.colab import files
# Interactive upload widget: the user supplies 2017.csv ... 2022.csv here.
uploaded = files.upload()
# importing package
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
import matplotlib.pyplot as plt
import networkx as nx
# Fetch the NLTK data needed by stopwords.words() and word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# reading datasets (one CSV per year, previously uploaded via files.upload())
df1 = pd.read_csv('2017.csv')
df2 = pd.read_csv('2018.csv')
df3 = pd.read_csv('2019.csv')
df4 = pd.read_csv('2020.csv')
df5 = pd.read_csv('2021.csv')
df6 = pd.read_csv('2022.csv')
data = pd.concat([df1,df2,df3,df4,df5,df6])
# NOTE(review): concat keeps each file's original row index, so index values
# repeat across years; harmless here because all downstream access is
# positional (.iloc), but ignore_index=True would be cleaner — confirm.
data
# Keep only the columns used by the analysis; assumes every yearly CSV has
# 'date' and 'tweet' columns — TODO confirm against the files.
data = data.loc[:,['date','tweet']]
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
# Bare expression: displays the DataFrame in the notebook.
data
# word frequency
def wordfre(data, year):
    """Build a word-frequency table for the tweets of one year.

    Parameters
    ----------
    data : DataFrame with at least 'year' and 'tweet' columns.
    year : int year used to filter the rows.

    Returns
    -------
    DataFrame with columns ['word', 'count'], sorted by count descending.
    Tokens are lower-cased; stopwords, non-alphabetic tokens and the URL
    scheme tokens 'http'/'https' are excluded.
    """
    from collections import Counter

    data1 = data[data['year'] == year]
    stop = set(stopwords.words('english'))
    # isalpha() already rejects every punctuation token from the original
    # skip-list; only 'http'/'https' (all-alphabetic) still need filtering.
    stop.update(('http', 'https'))
    counts = Counter()
    for tweet in data1['tweet']:
        for token in word_tokenize(tweet):
            token = token.lower()  # lower-case once instead of three times
            if token.isalpha() and token not in stop:
                counts[token] += 1
    # most_common() already yields (word, count) pairs sorted by count
    # descending — replaces the per-token DataFrame + groupby/count dance.
    frequency = pd.DataFrame(counts.most_common(), columns=['word', 'count'])
    return frequency
#function for Zipf's law
def zip(data):
    """Add Zipf's-law columns to a word-frequency table.

    Parameters
    ----------
    data : DataFrame with a 'count' column (e.g. output of wordfre).

    Returns
    -------
    A new DataFrame with a leading 1-based 'rank' column and a trailing
    'frequency' column (count / total count); the input is not mutated.

    NOTE(review): this function shadows the builtin ``zip``; the name is
    kept unchanged because the notebook cells below call it by this name.
    """
    # reset_index(drop=True) replaces the original double-reset_index /
    # drop / rename dance with a single explicit step.
    out = data.reset_index(drop=True)
    out.insert(0, 'rank', out.index + 1)  # ranks start at 1
    out['frequency'] = out['count'] / out['count'].sum()
    return out
#Function for loglog plot
def drawlog(data):
    """Plot frequency against rank on log-log axes (Zipf plot)."""
    plt.figure(figsize=(10, 10))
    axes = plt.subplot(111)
    # Set both scales first, then draw — same rendered figure.
    axes.set_xscale('log')
    axes.set_yscale('log')
    axes.plot(data['rank'], data['frequency'])
    axes.set_xlabel('rank')
    axes.set_ylabel('frequency')
    plt.show()
# function counting bigram frequencies
def bigcount(data, year):
    """Count bigram frequencies for the tweets of one year.

    Parameters
    ----------
    data : DataFrame with at least 'year' and 'tweet' columns.
    year : int year used to filter the rows.

    Returns
    -------
    DataFrame with columns ['word1', 'word2', 'count'], sorted by count
    descending, keeping only bigrams where both words contain a letter
    and neither matches the stopword pattern.
    """
    data1 = data[data['year'] == year]
    stop = set(stopwords.words('english'))
    # NOTE(review): unescaped, unanchored alternation — str.contains with
    # this pattern drops any word merely *containing* a stopword as a
    # substring (e.g. 'this' contains 'i'). Kept to preserve the original
    # filtering semantics; confirm whether word-boundary matching was meant.
    pattern = '|'.join(stop)
    # Collect all (word, next_word) pairs across the year's tweets.
    pairs = []
    for i in range(len(data1)):
        tokens = word_tokenize(data1.iloc[i]['tweet'])
        pairs.extend(nltk.bigrams(tokens))
    bigramsl = pd.DataFrame(pairs, columns=['word1', 'word2'])
    # Bug fix: the original tested word1 against the stopword pattern twice
    # and never stopword-filtered word2; both words are now filtered.
    keep = (~bigramsl['word1'].str.contains(pattern)
            & bigramsl['word1'].str.contains('[A-Za-z]')
            & ~bigramsl['word2'].str.contains(pattern)
            & bigramsl['word2'].str.contains('[A-Za-z]'))
    bigramsl = bigramsl[keep].reset_index()
    # count() on the residual 'index' column yields the per-pair totals.
    frequency = bigramsl.groupby(['word1', 'word2']).count()
    frequency = frequency.reset_index()
    frequency = frequency.rename(columns={'index': 'count'})
    frequency = frequency.sort_values('count', ascending=False)
    return frequency
def bivi(data):
    """Draw the bigram network: one edge per (word1, word2) pair,
    weighted by its count, laid out with a spring layout."""
    graph = nx.Graph()
    # Columns are used positionally: 0 = word1, 1 = word2, 2 = count.
    for row in data.itertuples(index=False):
        graph.add_edge(row[0], row[1], weight=row[2])
    plt.subplots(figsize=(20, 20))
    layout = nx.spring_layout(graph, k=2)
    nx.draw(graph, layout, with_labels=True, node_size=25)
    plt.show()
# --- Per-year analysis cells ---
# The same pipeline is repeated for each year 2017-2022. Bare expressions
# (e.g. `wordfre(data,2017)`) rely on notebook auto-display, which is why
# the repetition is not folded into a loop.
# year 2017
wordfre(data,2017)
# top 10 most frequent words
wordfre(data,2017)[:10]
# histogram of word counts
a = wordfre(data,2017)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency",fontsize=25)
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2017)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2017)
# visualization
a = bigcount(data,2017)
bivi(a)
# year 2018 word count
wordfre(data,2018)
# top 10 most frequent words
wordfre(data,2018)[:10]
# histogram of word counts
a = wordfre(data,2018)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency")
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2018)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2018)
# visualization
a = bigcount(data,2018)
bivi(a)
# year 2019 word count
wordfre(data,2019)
# top 10 most frequent words
wordfre(data,2019)[:10]
# histogram of word counts
a = wordfre(data,2019)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency")
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2019)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2019)
# visualization
a = bigcount(data,2019)
bivi(a)
# year 2020 word count
wordfre(data,2020)
# top 10 most frequent words
wordfre(data,2020)[:10]
# histogram of word counts
a = wordfre(data,2020)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency")
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2020)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2020)
# visualization
a = bigcount(data,2020)
bivi(a)
# year 2021 word count
wordfre(data,2021)
# top 10 most frequent words
wordfre(data,2021)[:10]
# histogram of word counts
a = wordfre(data,2021)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency")
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2021)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2021)
# visualization
a = bigcount(data,2021)
bivi(a)
# year 2022 word count
wordfre(data,2022)
# top 10 most frequent words
wordfre(data,2022)[:10]
# histogram of word counts
a = wordfre(data,2022)
fig, ax = plt.subplots(figsize=(20,5))
ax.hist(a['count'],100 )
ax.set_xlabel("Count",fontsize=25 )
ax.set_ylabel("Frequency",fontsize=25)
plt.show()
# Zipf's law: add rank and relative-frequency columns
a = wordfre(data,2022)
zip(a)
# plot log-log
a = zip(a)
drawlog(a)
# bigrams
# count
bigcount(data,2022)
# visualization
a = bigcount(data,2022)
bivi(a)
# Export this notebook to HTML; replace /PATH/TO/YOUR/NOTEBOOKFILE.ipynb
# with the notebook's actual path before running this cell.
%%shell
jupyter nbconvert --to html /PATH/TO/YOUR/NOTEBOOKFILE.ipynb